#!/usr/bin/env python3

## The purpose of this script is to look up the categories associated with all articles in a sample
## (and with each article's Talk page) and to write them to a TSV file.

# 0 - Globals and configs
# 1 - open a list of article titles
# 2 - make a list of titles to be queried
# 3 - assemble API calls and output results

import time
import urllib
import urllib.parse

import pandas as pd
import requests

# 0 - Globals and configs

# --- testing setup: small local input, verbose output ---
dataStub = 'test/'
infileList = ['testArts.tsv']
outfile = 'catlookups_test.tsv'
DEBUG = 1


# --- production setup: swap these in for the testing values above ---
#dataStub = '/gscratch/comdata/users/kaylea/taboo/processed_data/'
#infileList = ['articlesRandomSample.tsv','articlesTabooSample.tsv','titlesTabooSample.tsv']
#outfile = 'catlookups_test.tsv'

# --- general globals ---

# API endpoint every query in this script is sent to.
URL = "https://en.wikipedia.org/w/api.php"

# One shared session, so each request carries the identifying User-Agent.
S = requests.Session()
S.headers['User-Agent'] = 'Kaylea Champion querying category membership, kaylea@uw.edu'

# 1 - open a list of article titles
#
# Collect the encodedTitle column of every input file into one flat list,
# then (re)create the outfile with its header row.

if DEBUG:
	print(f"starting query building against {URL}")

titles = []
for file in infileList:
	if DEBUG:
		print(file)
	# Read in just the encodedTitle column, as text. dropna() discards blank
	# cells: pandas reports them as float NaN, which urllib.parse.unquote
	# cannot handle in step 2.
	data = pd.read_csv(dataStub + file, sep='\t', usecols=['encodedTitle'], dtype=str)
	tCol = data['encodedTitle'].dropna().tolist()
	if DEBUG:
		print(tCol)
	titles.extend(tCol) ## want one list not a list of lists
	if DEBUG:
		print(titles)

# Setup outfile: mode 'w' truncates any earlier run and writes the header.
# UTF-8 is set explicitly so the file does not depend on the locale default.
with open(dataStub + outfile, 'w', encoding='utf-8') as fh:
	fh.write('title\tencodedTitle\tcategory\n')

# 2 - make a list of titles to be queried
#
# Every decoded title is queued twice: once as the article itself and once
# as its 'Talk:' page.
fullTitles = []

for title in titles:
	# we encode titles with urllib.parse.quote
	title = urllib.parse.unquote(title)
	# Undo the single pair of double quotes that this script wraps around a
	# title before quoting it (see where encodedTitle is written in step 3).
	# Only ONE pair is removed: stripping every quote character would also
	# eat quotes that belong to the title, turning '"Weird Al" Yankovic'
	# into 'Weird Al" Yankovic'.
	if len(title) >= 2 and title[0] == '"' and title[-1] == '"':
		title = title[1:-1]
	fullTitles.append(title)
	fullTitles.append('Talk:' + title)

if DEBUG:
	print(fullTitles)



# 3 - assemble API calls and output results
#
# For each queued title, request its categories and append one
# title / encodedTitle / category row per category to the outfile. A single
# loop serves both the first request and any follow-up "continue" requests.

# Seconds to wait after a maxlag error when no usable Retry-After header is sent.
MAXLAG_RETRY_SECONDS = 5

for title in fullTitles:
	# Same wrap-then-quote convention as the encodedTitle column we read in.
	encodedTitle = urllib.parse.quote('"' + title + '"')
	PARAMS = {
		"action": "query",
		"format": "json",
		"prop": "categories",
		"titles": title,
		"maxlag": 5,
	}
	contParams = {}  # continuation values from the previous response, if any

	while True:
		time.sleep(1)  # pause before every request
		request = S.get(url=URL, params={**PARAMS, **contParams})
		myData = request.json()
		if DEBUG:
			print(myData)

		apiError = myData.get("error")
		if apiError is not None:
			if apiError.get("code") == "maxlag":
				# We asked for maxlag=5, so a lagged server refuses the
				# request instead of answering. Wait, then retry the same
				# request. NOTE: retries are unbounded while the lag lasts.
				retryAfter = request.headers.get("Retry-After", "")
				wait = int(retryAfter) if retryAfter.isdigit() else MAXLAG_RETRY_SECONDS
				if DEBUG:
					print(f'maxlag hit for {title}, retrying in {wait}s')
				time.sleep(wait)
				continue
			# Any other API error used to surface as KeyError: 'query';
			# fail just as loudly, but say what actually went wrong.
			raise RuntimeError(f'API error while querying {title!r}: {apiError}')

		pages = myData.get("query", {}).get("pages", {})

		# Open the outfile once per response rather than once per row.
		with open(dataStub + outfile, 'a', encoding='utf-8') as fh:
			for k, v in pages.items():
				if DEBUG:
					print(k)
					print(v)
				# Pages with no categories (including pages that do not
				# exist, e.g. many Talk: pages) have no 'categories' key.
				for cat in v.get('categories', []):
					if DEBUG:
						print(cat["title"])
					fh.write(f'{title}\t{encodedTitle}\t{cat["title"]}\n')

		## if there's a continue let's handle that too
		if "continue" not in myData:
			break
		print(f'Found a continue: {myData["continue"]} statement!')
		contParams = myData["continue"]


